#################################################################
# Start from a clean workspace: remove every object that ls() reports in the
# current environment. Attached packages and options are not affected.
rm(list = ls())
#################################################################
# Dependencies
#################################################################
# global
library(dplyr)
library(magrittr)
library(pbmcapply)
library(readr)

# Make the relative paths used below resolve against this script's directory.
#
# Asking RStudio for the active document only works inside a running RStudio
# session with a saved file; under Rscript or a plain R console that call
# raises an error, and an empty document path makes setwd() fail. Resolution
# order used here:
#   1. RStudio's active document path, when RStudio is available.
#   2. The `--file=` argument that Rscript puts on the command line.
#   3. Otherwise keep the current working directory and emit a warning.
# Wrapped in local() so the helper variables do not leak into the workspace.
local({
  script_dir <- NULL

  if (requireNamespace("rstudioapi", quietly = TRUE) &&
      rstudioapi::isAvailable()) {
    doc_path <- rstudioapi::getActiveDocumentContext()$path
    # The path is "" for unsaved documents and for the console context.
    if (length(doc_path) == 1L && nzchar(doc_path)) {
      script_dir <- dirname(doc_path)
    }
  }

  if (is.null(script_dir)) {
    file_arg <- grep("^--file=", commandArgs(trailingOnly = FALSE),
                     value = TRUE)
    if (length(file_arg) > 0L) {
      script_dir <- dirname(sub("^--file=", "", file_arg[1L]))
    }
  }

  if (is.null(script_dir)) {
    warning(
      "Could not determine the script location; keeping working directory ",
      getwd(),
      call. = FALSE
    )
  } else {
    setwd(script_dir)
  }
})
getwd()

# Read the two serialized inputs: the curated annotation set and the
# sentiment-classified corpus.
annot <- read_rds('../../data/2019/02-minified-corpus_2021-06-09-CURATED.RDS')
corpus <- read_rds('../../data/2019/smd_all_curated_classified_sentiment.RDS')

# Attach a zero-padded running number to every corpus row as `id`; the padding
# width is derived from the row count.
# NOTE(review): nchar() of a double uses its character form, so a row count
# such as 1e5 would yield width 5 ("1e+05") -- confirm this matches however
# annot$doc.id was generated before changing it.
corpus <- corpus %>%
  mutate(
    id = sprintf(
      paste0('%0', nchar(as.numeric(nrow(corpus))), 'd'),
      row_number()
    )
  )

# Diagnostics: how many annotation doc ids exist in the corpus, and the corpus
# structure.
table(annot$doc.id %in% corpus$id)
str(corpus)

# Join corpus columns onto the annotations by document id, drop rows whose `so`
# is missing, keep only the listed columns, and rename the two text columns.
annot <- annot %>%
  left_join(corpus, by = c("doc.id" = "id")) %>%
  filter(!is.na(so)) %>%
  select(
    doc.id, txt, person.id, so, so_txt, pubDateTime, la, tx,
    annotation_geography, annotation_person, selectsclass, sentiment_value
  ) %>%
  rename(snippet = txt, fullTxt = tx)
# Expand `annot` to one row per person id: a row whose `person.id` entry holds
# k ids is repeated k times, and `person.id` becomes a flat vector with one id
# per row. All other columns are copied unchanged.
#
# Done as a single vectorised row replication instead of one parallel task per
# row followed by rbind. This:
#   * works for zero-row input (no `1:nrow()` counting down to 0),
#   * drops rows that carry no id instead of losing the `person.id` column and
#     breaking the row-bind,
#   * cannot feed worker error objects into the result,
#   * avoids binding thousands of tiny data frames.
# local() keeps the helper variables out of the workspace.
unfolded <- local({
  # Flatten each entry so nested ids are counted the same way unlist() sees them.
  ids_per_row <- lapply(annot$person.id, unlist)
  expanded <- annot[rep(seq_len(nrow(annot)), lengths(ids_per_row)), ]

  flat_ids <- unlist(ids_per_row, use.names = FALSE)
  if (is.null(flat_ids)) {
    # Assigning NULL would delete the column; keep it as an empty vector.
    # Character matches the key type used for the later join on candidate id.
    flat_ids <- character(0)
  }
  expanded$person.id <- flat_ids
  expanded
})

# Load the candidate (named-entity) list; ids are cast to character so they can
# be matched against `person.id`.
candidates <- read.csv(
  '../../support/candidates-2019/00-Named_Entity_List_withID.csv',
  stringsAsFactors = FALSE
) %>%
  as_tibble() %>%
  mutate(id = as.character(id))

# Diagnostics on both join inputs. The bare `table()` call that used to sit
# here was removed: table() without arguments stops with "nothing to tabulate",
# which aborted the script before the join and the save below.
str(candidates)
table(candidates$party)
str(unfolded)

# Attach candidate attributes to every (document, person) row.
df <- left_join(unfolded, candidates, by = c("person.id" = "id"))

# Diagnostics: rows whose person id found no candidate (no `fullname` after the
# join), and the party distribution of the joined data.
filter(df, is.na(fullname))
table(df$party)

# Persist the annotated corpus and report the latest publication timestamp.
saveRDS(df, '../../data/2019/ner-SMD-annotated-corpus.RDS')
max(df$pubDateTime)
